PCA: an orthogonal linear transformation which projects the genotype data into the new reduced dimensional space such that the greater variance comes in an order
UMAP: a novel non-linear dimensionality reduction technique based on Riemannian geometry and algebraic topology to model and preserve the high-dimensional topology of data points in the lowdimensional space
PCA–UMAP: an application of UMAP for principal components of genotype data to be computationally more advantageous and statistically less noisy
UMAP has several hyperparameters that can have a signficant impact on the resulting embedding.
n_neighbors: The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range 2 to 100 (default = 15).
min_dist: The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the “spread“ value. (default = 0.1)
n_components: The dimension of the space to embed into (defaults = 2)metric: The metric to use to compute distances in high dimensional space (default = euclidean)n pca: For genetic ancestry the number of PCs used in the data input. Adding more components results in progressively finer clusters until approximately 20 populations appear using 15 components; adding further components converges to results similar to using the entire genotype data| parameter | umapr | umap | uwot |
|---|---|---|---|
| include_input | TRUE | NA | NA |
| n_neighbors | 15 | 15 | 15 |
| n_components | 2 | 2 | 2 |
| metric | euclidean | euclidean | euclidean |
| n_epochs | NULL | 200 | NULL |
| learning_rate | 1 | NA | 1 |
| alpha | 1 | 1 | NA |
| init | spectral | spectral | spectral |
| spread | 1 | 1 | 1 |
| min_dist | 0.1 | 0.1 | 0.01 |
| set_op_mix_ratio | 1 | 1 | 1 |
| local_connectivity | 1 | 1 | 1 |
| repulsion_strength | 1 | NA | 1 |
| bandwidth | 1 | 1 | 1 |
| gamma | 1 | 1 | NA |
| negative_sample_rate | 5 | 5 | 5 |
| transform_queue_size | 4 | NA | NA |
| a | NULL | NULL | NULL |
| b | NULL | NULL | NULL |
| random_state | NULL | NULL | NA |
| metric_kwds | dict() | NA | NA |
| angular_rp_forest | FALSE | NA | NA |
| target_n_neighbors | -1 | NA | n_neighbors |
| target_metric | categorical | NA | euclidean |
| target_metric_kwds | dict() | NA | NA |
| target_weight | 0.5 | NA | 0.5 |
| transform_seed | 42 | NA | NA |
## Provenence
country <- read_tsv("/Users/sheaandrews/GitCode/MitoImputePrep/DerivedData/ReferencePanel_v6/ReferencePanel_v1_highQual_MAF0.01_filtered_countryInfo.ped") %>%
select(Family, Individual, geographic_data, Continent, Country, Region)
## Haplogroup assignments
dat <- generate_snp_data_fixed("/Users/sheaandrews/GitCode/MitoImpute/ReferencePanel/ReferencePanel.map",
"/Users/sheaandrews/GitCode/MitoImpute/ReferencePanel/ReferencePanel.ped")
data(nodes)
classifications.raw <- HiMC::getClassifications(dat)
classifications <- as_tibble(classifications.raw[-nrow(classifications.raw),]) #remove Mitochondria_Information row
ped <- dat %>%
magrittr::set_colnames(
c("Family", "Individual", "Father", "Mother", "Sex", "Phenotype",
paste0('mt', colnames(magrittr::extract(., 7:ncol(.))))
)
) %>%
as_tibble() %>%
na_if(., "0 0") %>%
mutate_at(vars(starts_with("mt")), as_factor) %>%
mutate_at(vars(starts_with("mt")), as.numeric) %>%
left_join(classifications) %>%
left_join(country) %>%
mutate(macro = ifelse(str_detect(haplogroup, "^L"),
substr(haplogroup, start = 1, stop = 2),
substr(haplogroup, start = 1, stop = 1)),
Continent = replace_na(Continent, "Unknown"),
supclu = case_when(
macro %in% c("L0", "L1", "L2", "L3") ~ "L",
macro %in% c("Q", "G", "E", "D", "C", "Z", "M") ~ "M",
macro %in% c("A", "S", "O", "Y", "I", "W", "X", "N") ~ "N",
macro %in% c("R", "P", "B", "F", "J", "T", "U", "K", "H", "V") ~ "R"
)) %>%
select(Family, Individual, Father, Mother, Sex, Phenotype, haplogroup, macro, supclu,
geographic_data, Continent, Country, Region, everything()) %>%
filter(macro %nin% c("u")) %>%
filter(!is.na(macro))
Run PCA analysis of mtReference Panel
## PCA
res.pca <- ped %>%
select(starts_with("mt")) %>%
PCA(., scale.unit = F, ncp = 10, graph = FALSE)
ind.pca <- get_pca_ind(res.pca)$coord %>%
as_tibble() %>%
bind_cols(select(ped, Individual, haplogroup, macro, supclu, Continent)) %>%
select(Individual, haplogroup, macro, Continent, supclu, everything())
eig.val <- get_eigenvalue(res.pca)
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))
ggplot(ind.pca, aes(x = Dim.1, y = Dim.2, colour = macro)) +
scale_color_manual(values = cols25()) +
geom_point() + theme_bw()
ggplot(ind.pca, aes(x = Dim.1, y = Dim.2, colour = supclu)) +
scale_color_manual(values = cols25()) +
geom_point() + theme_bw()
ggplot() +
geom_point(data = filter(ind.pca, Continent == "Unknown"), aes(x = Dim.1, y = Dim.2, colour = Continent)) +
geom_point(data = filter(ind.pca, Continent != "Unknown"), aes(x = Dim.1, y = Dim.2, colour = Continent)) +
scale_color_manual(values = cols25()) +
theme_bw()
plot_ly(ind.pca, x = ~Dim.2, y = ~Dim.1, z = ~Dim.3, color = ~macro, colors = cols25(),
type = 'scatter3d', mode = 'markers', marker = list(size = 3),
hoverinfo = 'text', text = ~paste('</br> ID: ', Individual,
'</br> Haplogroup: ', macro,
'</br> PC1: ', round(Dim.1, 2),
'</br> PC2: ', round(Dim.2, 2),
'</br> PC3: ', round(Dim.3, 2)))
plot_ly(ind.pca, x = ~Dim.2, y = ~Dim.1, z = ~Dim.3, color = ~supclu, colors = cols25(),
type = 'scatter3d', mode = 'markers', marker = list(size = 3),
hoverinfo = 'text', text = ~paste('</br> ID: ', Individual,
'</br> Haplogroup: ', macro,
'</br> PC1: ', round(Dim.1, 2),
'</br> PC2: ', round(Dim.2, 2),
'</br> PC3: ', round(Dim.3, 2)))
plot_ly(ind.pca, x = ~Dim.2, y = ~Dim.1, z = ~Dim.3, color = ~Continent, colors = cols25(),
type = 'scatter3d', mode = 'markers', marker = list(size = 3),
hoverinfo = 'text', text = ~paste('</br> ID: ', Individual,
'</br> Haplogroup: ', macro,
'</br> PC1: ', round(Dim.1, 2),
'</br> PC2: ', round(Dim.2, 2),
'</br> PC3: ', round(Dim.3, 2)))
embedding_mtref_2d <- select(ind.pca, starts_with("Dim")) %>%
uwot::umap(., min_dist = 0.5, n_neighbor = 100, n_components = 2) %>%
as.tibble() %>%
bind_cols(select(ped, Individual, haplogroup, macro, supclu, Continent)) %>%
rename(UMAP1 = V1, UMAP2 = V2)
ggpubr::ggarrange(
ggplot(embedding_mtref_2d, aes(x = UMAP1, y = UMAP2, color = macro)) +
geom_point() +
scale_color_manual(values = cols25()) +
theme_bw(),
ggplot(embedding_mtref_2d, aes(x = UMAP1, y = UMAP2, color = supclu)) +
geom_point() +
scale_color_manual(values = cols25()) +
theme_bw(),
ggplot() +
geom_point(data = filter(embedding_mtref_2d, Continent == "Unknown"), aes(x = UMAP1, y = UMAP2, colour = Continent)) +
geom_point(data = filter(embedding_mtref_2d, Continent != "Unknown"), aes(x = UMAP1, y = UMAP2, colour = Continent)) +
scale_color_manual(values = cols25()) +
theme_bw()
)
embedding_mtref_3d <- select(ind.pca, starts_with("Dim")) %>%
uwot::umap(., min_dist = 0.5, n_neighbor = 100, n_components = 3) %>% as.tibble() %>%
bind_cols(select(ped, Individual, haplogroup, macro, supclu, Continent)) %>%
rename(UMAP1 = V1, UMAP2 = V2, UMAP3 = V3)
plot_ly(embedding_mtref_3d, x = ~UMAP2, y = ~UMAP1, z = ~UMAP3, color = ~macro, colors = cols25(), type = 'scatter3d', mode = 'markers', marker = list(size = 3),
hoverinfo = 'text', text = ~paste('</br> ID: ', Individual,
'</br> Haplogroup: ', macro,
'</br> PC1: ', round(UMAP1, 2),
'</br> PC2: ', round(UMAP2, 2),
'</br> PC3: ', round(UMAP3, 2)))